#Abstract
Customer Lifetime Value (CLV) is an important metric for businesses. CLV represents the total worth/profit from a particular customer to a business over a period of time. It provides a measure of relative importance of existing customers to a business in terms of revenue, churn, interaction, and loyalty. This metric informs business leaders on decisions regarding strategic focus on existing customer retention or new customer acquisition; marketing strategies targeted at specific segments of the populace; and recommended advertisement budget projections. In short, CLV is a powerful measure of customer value to a business based on historic transaction data; a metric which is hard to encapsulate in existing summary statistics such as total transactions, average purchase amount per transaction, or percentage of total sales. This project aims to predict the CLV of customers based on 1-year transaction data. First, we will extract customer-level information on purchases and returns from the data set to generate features per customer including Average Order Value (AOV), Days to Purchase (D2P), Gross Margin, and Recency, Frequency, Monetary (RFM) Metrics. Based on these features, we will build a regression model to estimate and predict CLV. After validating model assumptions and conducting goodness of fit analysis, we will compare our model to existing CLV models from other researchers who did not include returns as a part of their analysis. We will use both models to predict CLV of the same data set and analyze how differing methodologies and model assumptions influenced difference in CLV predictions.
#Data Source https://archive.ics.uci.edu/ml/datasets/online+retail
#Read Data file in
# Import library you may need
library(car)
library(dplyr)
library(TSstudio)
library(ggplot2)
library(tibble)
# Read the data set
# Load the UCI Online Retail transactions; the CSV export is expected in the
# working directory. One row per invoice line item.
raw <- read.csv("Online Retail.csv")
head(raw)
## InvoiceNo StockCode Description Quantity InvoiceDate
## 1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010
## 2 536365 71053 WHITE METAL LANTERN 6 12/1/2010
## 3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010
## 4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 12/1/2010
## 5 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010
## 6 536365 22752 SET 7 BABUSHKA NESTING BOXES 2 12/1/2010
## UnitPrice CustomerID Country
## 1 2.55 17850 United Kingdom
## 2 3.39 17850 United Kingdom
## 3 2.75 17850 United Kingdom
## 4 3.39 17850 United Kingdom
## 5 3.39 17850 United Kingdom
## 6 7.65 17850 United Kingdom
#Data Description InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter ‘c’, it indicates a cancellation.
StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.
Description: Product (item) name. Nominal.
Quantity: The quantities of each product (item) per transaction. Numeric.
InvoiceDate: Invoice Date and time. Numeric, the day and time when each transaction was generated.
UnitPrice: Unit price. Numeric, Product price per unit in sterling.
CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.
Country: Country name. Nominal, the name of the country where each customer resides.
# Inspect column names and record the raw row count (num_row is reused
# below when reporting how many duplicate rows get dropped).
# Note: do.NULL = TRUE and prefix = "col" are defaults, so plain colnames()
# produces the identical output.
colnames(raw)
## [1] "InvoiceNo" "StockCode" "Description" "Quantity" "InvoiceDate"
## [6] "UnitPrice" "CustomerID" "Country"
num_row <- nrow(raw)
cat('Number of rows:', num_row)
## Number of rows: 541909
#Convert nominal data to categorical data
# All ID-like columns are nominal, so coerce them to factors in one pass.
nominal_cols <- c("InvoiceNo", "StockCode", "Description", "CustomerID", "Country")
raw[nominal_cols] <- lapply(raw[nominal_cols], as.factor)
# InvoiceDate arrives as "m/d/Y" text; parse to Date (time of day is dropped).
raw$InvoiceDate <- as.Date(raw$InvoiceDate, "%m/%d/%Y")
sapply(raw, class)
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## "factor" "factor" "factor" "integer" "Date" "numeric"
## CustomerID Country
## "factor" "factor"
#Removing redundant rows of the data
# Deduplicate exact repeated invoice lines, then drop rows with any missing
# field (mostly missing CustomerID). Counts are reported as we go.
rdd_raw <- raw %>% distinct()
num_row_rdd <- nrow(rdd_raw)
cat('Number of redundant rows been removed:', num_row - num_row_rdd)
## Number of redundant rows been removed: 5269
#Removing rows that contain missing value(s)
missing_raw <- na.omit(rdd_raw)
cat('Number of rows contains missing values been further removed:', num_row_rdd - nrow(missing_raw))
## Number of rows contains missing values been further removed: 135037
cleaned_df <- missing_raw
head(cleaned_df)
## InvoiceNo StockCode Description Quantity InvoiceDate
## 1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01
## 2 536365 71053 WHITE METAL LANTERN 6 2010-12-01
## 3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 2010-12-01
## 4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 2010-12-01
## 5 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 2010-12-01
## 6 536365 22752 SET 7 BABUSHKA NESTING BOXES 2 2010-12-01
## UnitPrice CustomerID Country
## 1 2.55 17850 United Kingdom
## 2 3.39 17850 United Kingdom
## 3 2.75 17850 United Kingdom
## 4 3.39 17850 United Kingdom
## 5 3.39 17850 United Kingdom
## 6 7.65 17850 United Kingdom
# Row count after deduplication and NA removal (541909 -> 401603).
nrow(cleaned_df)
## [1] 401603
#Segment the Data into First (Initial) and Second (Final) 6 Months (i.e. May 31, 2011)
# Date-based filtering replaces the positional max(which(...)) indexing,
# which assumed rows were sorted by date AND that both boundary dates
# ("2011-05-31" and "2011-06-01") actually occur in the data — an empty
# which() would have produced -Inf and a subscript error. Filtering on the
# date column yields the same split on sorted data and is robust otherwise.
split_date <- as.Date("2011-05-31")
initial_df <- cleaned_df[cleaned_df$InvoiceDate <= split_date, ]
final_df <- cleaned_df[cleaned_df$InvoiceDate > split_date, ]
#Explore the data ##Count categorical data
# Distinct-value counts for the nominal columns
# (1 = InvoiceNo, 2 = StockCode, 7 = CustomerID, 8 = Country).
nominal_idx <- c(1, 2, 7, 8)
n_distinct_cols <- function(d) {
  vapply(d[nominal_idx], function(col) length(unique(col)), integer(1))
}
count_unique <- n_distinct_cols(cleaned_df)
initial_unique <- n_distinct_cols(initial_df)
final_unique <- n_distinct_cols(final_df)
count_unique
## InvoiceNo StockCode CustomerID Country
## 22190 3684 4372 37
initial_unique
## InvoiceNo StockCode CustomerID Country
## 8998 3132 2767 35
final_unique
## InvoiceNo StockCode CustomerID Country
## 13192 3267 3577 32
By counting the unique values in the data, we find that: 1. The data frame contains 22190 unique transactions. 2. The data frame contains 3684 distinct products. 3. The data frame contains 4372 unique customers. 4. The data was collected across 37 countries.
##Time series plot of sales over entire time span
# InvoiceDate is already Date after cleaning; as.Date() here is a no-op.
all_date <- as.Date(cleaned_df$InvoiceDate)
# Sum quantities per calendar day -> data frame (Group.1 = day, x = total);
# ts_plot consumes these default aggregate() column names directly.
quantity_over_time <- aggregate(cleaned_df$Quantity, by=list(all_date), sum)
# TSstudio::ts_plot renders the daily totals as an interactive series.
qov_plot<-ts_plot(quantity_over_time,
title = "Daily Sales Quantity",
Xtitle = "Time",
Ytitle = "Number of Items")
qov_plot
Plot aggregated daily sales quantity. We do not see abnormal behaviors across time.
##Order quantity from top 10 countries
# Count invoice lines per country. Because Country is a factor built on the
# raw file, table() reports every level (including zero-count ones), so the
# hard-coded positions c(29:38) only worked while exactly 38 levels existed;
# tail(, 10) always selects the ten largest after sorting, regardless of the
# level count.
country_table <- sort(table(cleaned_df$Country))
ranked <- tail(country_table, 10)
print(ranked)
##
## Australia Portugal Switzerland Belgium Netherlands
## 1258 1471 1877 2069 2371
## Spain EIRE France Germany United Kingdom
## 2528 7475 8475 9480 356727
barplot(ranked,main = "Order Quantity from Top 10 Countries",las=2,cex.names=0.6)
Ranking countries that have the most orders. The United Kingdom ordered the most and accounts for a huge portion of the data. We should be careful when performing regression, since a very large share of orders came from the UK.
##Number of sales and return for Initial Six Months
# A negative Quantity marks a return line, a positive one a sale line;
# summing the logical masks counts rows directly.
initial_return <- sum(initial_df$Quantity < 0)
initial_sale <- sum(initial_df$Quantity > 0)
cat('the number of sales of first six months of the transactions is ', initial_sale)
## the number of sales of first six months of the transactions is 143753
cat('\nthe number of returns of first six months of the transactions is ', initial_return)
##
## the number of returns of first six months of the transactions is 3574
##Transaction Frequency of First Six Months
#Unique Customers in First Six Months
# table() on a factor reports every level, including customers with no
# first-half activity — hence the Freq > 0 filters below.
initial_customer_table<-table(initial_df$CustomerID)
initial_customer_all<-as.data.frame(initial_customer_table)
initial_customer=filter(initial_customer_all,initial_customer_all$Freq>0)
#First 6 Months of return data frame
# initial_return_df / initial_sale_df are reused by many feature blocks below.
initial_return_df<-initial_df[initial_df$Quantity<0,]
initial_return_table<-table(initial_return_df$CustomerID)
initial_customer_return<-as.data.frame(initial_return_table)
# NOTE(review): this reassigns initial_return (and initial_sale below),
# which previously held the scalar counts printed by cat(); from here on
# they are per-customer frequency data frames used by the histograms.
initial_return=filter(initial_customer_return,initial_customer_return$Freq>0)
#First 6 Months of sales data frame
initial_sale_df<-initial_df[initial_df$Quantity>0,]
initial_sale_table<-table(initial_sale_df$CustomerID)
initial_customer_sale<-as.data.frame(initial_sale_table)
initial_sale=filter(initial_customer_sale,initial_customer_sale$Freq>0)
#Histogram of First Six Months
# Shared histogram recipe: Freq on the x axis, fixed bin width of 10.
freq_hist <- function(d, plot_title) {
  ggplot(d, aes(x = Freq)) +
    geom_histogram(binwidth = 10) +
    labs(title = plot_title, y = "Density")
}
print(freq_hist(initial_customer, "Histogram of Transaction Frequency of First Six Months of Customers"))
print(freq_hist(initial_return, "Histogram of Returns Frequency of First Six Months of Customers"))
print(freq_hist(initial_sale, "Histogram of Sales Frequency of First Six Months of Customers"))
#Stock Ordering Frequency of First 6 Months
#Stockcode ordering df for First 6 Months
# How often each stock code appears among first-half sale lines; drop the
# zero-count factor levels that table() carries along.
sc_counts <- as.data.frame(table(initial_sale_df$StockCode))
initial_SC_ordering <- filter(sc_counts, Freq > 0)
#Plot of Stock Ordering First 6 Months
print(
  ggplot(initial_SC_ordering, aes(x = Freq)) +
    geom_histogram() +
    labs(title="Histogram of Item Ordering Frequency of All Products for First Six Months", y = "Density")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Dataframe creation for Customers in First 6 Months
# Attach per-line revenue; Quantity < 0 on return lines makes Income negative.
add_income <- function(d) mutate(d, Income = Quantity * UnitPrice)
initial_df2 <- add_income(initial_df)
initial_sale_df2 <- add_income(initial_sale_df)
initial_return_df2 <- add_income(initial_return_df)
#Unique Customers in First 6 Months
# Seed the customer-level feature table: one row per customer seen in H1;
# every feature below is left-joined onto this frame.
df <- data.frame(CustomerID = unique(initial_df2$CustomerID))
##Features of Customer Overall Orders
#Total Number of Purchase Orders by Customer
# One row per (customer, invoice) pair among purchase lines; OrderNo is the
# number of line items on that invoice. NOTE: OrderNo is reused further down
# to derive Unique_Item_Per_Basket.
OrderNo <- initial_sale_df%>%
group_by(CustomerID,InvoiceNo)%>%
summarise(OrderNo = n())
## `summarise()` has grouped output by 'CustomerID'. You can override using the `.groups` argument.
# Collapse to one row per customer: count of distinct purchase invoices.
Orders_Unique <- OrderNo%>%
group_by(CustomerID)%>%
summarise(Orders_Unique = n())
df <- left_join(df,Orders_Unique,'CustomerID')
#Total Number of Return Orders by Customer
# Same construction on return lines (Quantity < 0); ReturnNo is reused later
# for Unique_Item_Per_Return.
ReturnNo <- initial_return_df%>%
group_by(CustomerID,InvoiceNo)%>%
summarise(ReturnNo = n())
## `summarise()` has grouped output by 'CustomerID'. You can override using the `.groups` argument.
Returns_Unique <- ReturnNo%>%
group_by(CustomerID)%>%
summarise(Returns_Unique = n())
df <- left_join(df,Returns_Unique,'CustomerID')
#Total Number of Invoice's (Purchase and Return) by Customer
# NOTE(review): this assignment shadows the InvoiceNo column name with a
# data frame of the same name; dplyr still resolves the column inside
# group_by(), but the shadowing is easy to misread.
InvoiceNo <- initial_df%>%
group_by(CustomerID,InvoiceNo)%>%
summarise(InvoiceNo = n())
## `summarise()` has grouped output by 'CustomerID'. You can override using the `.groups` argument.
Invoice_Unique <- InvoiceNo%>%
group_by(CustomerID)%>%
summarise(Invoice_Unique = n())
df <- left_join(df,Invoice_Unique,'CustomerID')
##Total Item Features
#Total Items Purchased by Customer
Sale_Quantity <- initial_sale_df%>%
group_by(CustomerID)%>%
summarise(Total_Items_Purchased=sum(Quantity))
df <- left_join(df,Sale_Quantity,'CustomerID')
#Total Quantity Per Purchase Basket
# NA for return-only customers (no purchase rows); the global NA -> 0
# replacement later converts these to 0.
df$Quantity_Basket=df$Total_Items_Purchased/df$Orders_Unique
#Total Items Returned by Customer
# Quantity is negative on return lines, so this total is <= 0.
Return_Quantity <- initial_return_df%>%
group_by(CustomerID)%>%
summarise(Total_Items_Returned=sum(Quantity))
df <- left_join(df,Return_Quantity,'CustomerID')
#Total Quantity Per Return
#df$Quantity_Return=df$Total_Items_Returned/df$Returns_Unique
#Net Total Items by Customer
# Net = purchases + (negative) returns, computed over all invoice lines.
Net_Quantity <- initial_df%>%
group_by(CustomerID)%>%
summarise(Net_Total_Items=sum(Quantity))
df <- left_join(df,Net_Quantity,'CustomerID')
#Net Total Quantity Per Invoice
#df$Average_Net_Quantity = df$Net_Total_Items/df$Invoice_Unique
##Unique Item Features
#Total Number of Unique Items Purchased by Customer
# One row per (product, customer) pair among purchase lines.
Purchase <- initial_sale_df%>%
group_by(StockCode,CustomerID)%>%
summarise(Purchase = n())
## `summarise()` has grouped output by 'StockCode'. You can override using the `.groups` argument.
# Count of distinct products each customer bought.
Purchase_Unique <- Purchase%>%
group_by(CustomerID)%>%
summarise(Types_Items_Purchased=n())
df <- left_join(df,Purchase_Unique,'CustomerID')
#Total Number of Unique Items Per Purchase Basket
# Mean line-item count per purchase invoice (OrderNo built earlier).
Unique_Item_Per_Basket <-OrderNo%>%
group_by(CustomerID)%>%
summarise(Unique_Item_Per_Basket = mean(OrderNo))
df <- left_join(df,Unique_Item_Per_Basket,'CustomerID')
#Total Number of Unique Items Returned by Customer
# Mirror of the purchase features, computed on return lines.
Cancellation <- initial_return_df%>%
group_by(StockCode,CustomerID)%>%
summarise(Cancellation = n())
## `summarise()` has grouped output by 'StockCode'. You can override using the `.groups` argument.
Cancellation_Unique <- Cancellation%>%
group_by(CustomerID)%>%
summarise(Types_Items_Returned=n())
df <- left_join(df,Cancellation_Unique,'CustomerID')
#Total Number of Unique Items Per Return Order
# Mean line-item count per return invoice (ReturnNo built earlier).
Unique_Item_Per_Return <-ReturnNo%>%
group_by(CustomerID)%>%
summarise(Unique_Item_Per_Return = mean(ReturnNo))
df <- left_join(df,Unique_Item_Per_Return,'CustomerID')
##Total Revenue Features
#Total Sales Revenue per Customer
Sales_Revenue <- initial_sale_df2%>%
group_by(CustomerID)%>%
summarise(Sales_Revenue=sum(Income))
df <- left_join(df,Sales_Revenue,'CustomerID')
#Average Order Value
#df$Average_Order_Value = df$Sales_Revenue/df$Orders_Unique
#Total Return Refund per Customer
# Income is negative on return lines, so Return_Refund is <= 0.
Return_Refund <- initial_return_df2%>%
group_by(CustomerID)%>%
summarise(Return_Refund=sum(Income))
df <- left_join(df,Return_Refund,'CustomerID')
#Average Return Value
#df$Average_Return_Value = df$Return_Refund/df$Returns_Unique
#Total Revenue (Sales-Refund) per Customer
# FIX: the left joins leave NA for customers with no returns (or no sales),
# and NA + x is NA; the later df[is.na(df)] = 0 then zeroed Total_Revenue
# for every no-return customer — visible in the printed head(), where rows
# with Sales_Revenue 580.85 / 2650.70 show Monetory_Value 0.00. Treating a
# missing side as 0 preserves the customer's actual revenue.
df$Total_Revenue <- rowSums(cbind(df$Sales_Revenue, df$Return_Refund), na.rm = TRUE)
#Average Invoice Value
#df$Average_Invoice_Value = df$Total_Revenue/df$Invoice_Unique
##Features for per unit pricing
#Average Unit Price per Item Purchased
# Revenue-weighted average unit price across the customer's purchase lines.
Average_Unit_Price_Purchase <- initial_sale_df2%>%
group_by(CustomerID)%>%
summarise(Average_Unit_Price_Purchase=sum(Income)/sum(Quantity))
df <- left_join(df,Average_Unit_Price_Purchase,'CustomerID')
#Average Unit Refund per Item Returned
# Income and Quantity are both negative on return lines, so this ratio is
# positive.
Average_Unit_Refund_Return <- initial_return_df2%>%
group_by(CustomerID)%>%
summarise(Average_Unit_Refund_Return=sum(Income)/sum(Quantity))
df <- left_join(df,Average_Unit_Refund_Return,'CustomerID')
#Average Unit Revenue per Item in Invoice
#df$Average_Unit_Revenue=df$Total_Revenue/df$Net_Total_Items
#Add Country Designation
# NOTE(review): summarise(Country=unique(Country)) yields one row per
# distinct country for a customer; a customer recorded under two countries
# would duplicate rows in df after the join — confirm each CustomerID maps
# to a single Country.
Country <- initial_df%>%
group_by(CustomerID)%>%
summarise(Country=unique(Country))
## `summarise()` has grouped output by 'CustomerID'. You can override using the `.groups` argument.
df <- left_join(df,Country,'CustomerID')
##Add Most popular item by Customer
#Most_Purchased_StockCode <- initial_sale_df2%>%
# group_by(CustomerID,StockCode)%>%
# summarise(Sum_Return=sum(Income))
#colnames(Most_Purchased_StockCode)[which(colnames(Most_Purchased_StockCode) %in% c("StockCode") )] <- #c("Most_Purchased_StockCode")
#Most_Purchased_StockCode <- Most_Purchased_StockCode %>%
# group_by(CustomerID) %>% filter(Sum_Return == max(Sum_Return))
#df <- left_join(df,Most_Purchased_StockCode[c('CustomerID','Most_Purchased_StockCode')],'CustomerID')
#Is a customer purchasing the most profitable items
# Rank products by total first-half sales revenue.
Profitable_items <- initial_sale_df2 %>%
  group_by(StockCode) %>%
  summarise(Total_Monetary_Value = sum(Income)) %>%
  arrange(desc(Total_Monetary_Value))
# Top 3 items' StockCode: "23166", "22423", "85123A"
top_stockcodes <- c("23166", "22423", "85123A")
# Flag a customer 1 if any of their purchase lines involves a top-3 item,
# 0 otherwise (numeric, matching the original case_when encoding).
Is_Buying_Most_Popular <- initial_sale_df2 %>%
  mutate(top_hit = as.numeric(StockCode %in% top_stockcodes)) %>%
  group_by(CustomerID) %>%
  summarise(Is_Buying_Most_Popular = as.numeric(sum(top_hit) >= 1))
df <- left_join(df, Is_Buying_Most_Popular, 'CustomerID')
# Customers missing a feature (e.g. no returns) get 0 for that feature.
df[is.na(df)] <- 0
##After creating the raw data set on the customer level, we will start creating additional predictor variables. Since we are studying RFM, we need to first calculate the Recency, Frequency, and Monetary value scores. # https://towardsdatascience.com/recency-frequency-monetary-model-with-python-and-how-sephora-uses-it-to-optimize-their-google-d6a0707c5f17 # the quantile method is borrowed from the link above. ###Recency is defined as: the quantile of the number of months since last purchase across all customers
#Recency calculation for each customer. Number of days since last purchase to the end date of the initial data file (31 May 11)
# Recency is a difftime in days: window end (max date over all initial
# invoices) minus the customer's own last invoice date.
Recency <- initial_df%>%
group_by(CustomerID)%>%
summarise(Recency=max(initial_df$InvoiceDate)-max(InvoiceDate))
# Percentile rank (ntile 1..100, scaled to 0.01..1.00) across customers.
# NOTE(review): a larger Recency (longer since last purchase) gets a HIGHER
# quantile here, so stale customers raise RFM_Score — conventional RFM
# scores recent buyers higher; confirm this direction is intended.
# (RFM_Score is dropped before modeling, so model inputs are unaffected.)
Recency_Quantile <- Recency %>% mutate(Recency_Quantile = ntile(Recency, 100)/100)
df <- left_join(df,Recency_Quantile,'CustomerID')
###Frequency is originally defined as: quantile of number of purchases within the first 6 months within all customers
df <- df %>% mutate(Frequency_Quantile = ntile(Invoice_Unique, 100)/100)
# Rename Invoice_Unique -> Frequency; later code still lists the old name.
colnames(df)[which(colnames(df) %in% c("Invoice_Unique") )] <- c("Frequency")
###Monetary value quantile: percentile rank of the customer's total (net) revenue across all customers
df <- df %>% mutate(Monetory_Value_Quantile = ntile(Total_Revenue, 100)/100)
colnames(df)[which(colnames(df) %in% c("Total_Revenue") )] <- c("Monetory_Value")
# Composite score: sum of the three quantiles, then binned into terciles.
df$RFM_Score <- df$Recency_Quantile+df$Frequency_Quantile+df$Monetory_Value_Quantile
df <- df %>% mutate(RFM_Score = ntile(RFM_Score, 3))
##Dataframe for Last 6 Months Customer Revenue
#Response for Revenue of Customers
# Response variable: each customer's net revenue over the final six months.
temp <- final_df%>%
mutate(Income=Quantity*UnitPrice)
Y_Income <- temp %>%
group_by(CustomerID)%>%
summarise(Y_Income = sum(Income))
##Combined Dataframe for Model
# inner_join keeps only customers active in BOTH halves of the year;
# first-half-only customers are dropped rather than given Y_Income = 0.
final_data <- inner_join(df,Y_Income,'CustomerID')
#final_data <- na.omit(final_data)
#final_data[, c(1: ncol(final_data))] <- sapply(final_data[, c(1: ncol(final_data))], as.numeric)
#final_data <- final_data[!is.infinite(rowSums(final_data)),]
final_data$Country<-as.factor(final_data$Country)
final_data$Is_Buying_Most_Popular<-as.factor(final_data$Is_Buying_Most_Popular)
# Drop identifier and redundant columns before modeling.
# NOTE(review): 'Invoice_Unique' was renamed to 'Frequency' earlier, so that
# entry no longer matches a column here — confirm nothing stale remains.
final_data[ ,c('CustomerID','Invoice_Unique','Net_Total_Items','RFM_Score')] <- list(NULL)
head(final_data)
## Orders_Unique Returns_Unique Frequency Total_Items_Purchased Quantity_Basket
## 1 6 2 8 681 113.50000
## 2 5 1 6 2025 405.00000
## 3 3 0 3 223 74.33333
## 4 9 4 13 832 92.44444
## 5 10 2 12 2066 206.60000
## 6 6 0 6 582 97.00000
## Total_Items_Returned Types_Items_Purchased Unique_Item_Per_Basket
## 1 -6 60 13.666667
## 2 -1 62 19.000000
## 3 0 11 4.333333
## 4 -25 43 6.444444
## 5 -9 99 17.900000
## 6 0 18 3.333333
## Types_Items_Returned Unique_Item_Per_Return Sales_Revenue Return_Refund
## 1 4 2.0 1671.68 -18.00
## 2 1 1.0 3042.39 -18.00
## 3 0 0.0 580.85 0.00
## 4 4 1.0 2679.59 -63.09
## 5 3 1.5 2786.43 -19.09
## 6 0 0.0 2650.70 0.00
## Monetory_Value Average_Unit_Price_Purchase Average_Unit_Refund_Return
## 1 1653.68 2.454743 3.000000
## 2 3024.39 1.502415 18.000000
## 3 0.00 2.604709 0.000000
## 4 2616.50 3.220661 2.523600
## 5 2767.34 1.348708 2.121111
## 6 0.00 4.554467 0.000000
## Country Is_Buying_Most_Popular Recency Recency_Quantile
## 1 United Kingdom 1 27 days 0.37
## 2 France 0 8 days 0.11
## 3 United Kingdom 1 40 days 0.42
## 4 United Kingdom 0 55 days 0.54
## 5 United Kingdom 0 6 days 0.08
## 6 United Kingdom 0 25 days 0.35
## Frequency_Quantile Monetory_Value_Quantile Y_Income
## 1 0.92 0.91 1425.42
## 2 0.87 0.95 4162.95
## 3 0.63 0.01 367.40
## 4 0.97 0.94 1980.01
## 5 0.97 0.95 2288.27
## 6 0.87 0.01 1976.92
# Persist the customer-level modeling data set for the regression stage.
write.csv(final_data,"final_data.csv",row.names = FALSE)